import pandas as pd
import geopandas as gpd
import numpy as np
import osmnx as ox
import networkx as nx
import hvplot.pandas
import holoviews as hv
hv.extension("bokeh")
import folium
import datashader as ds
import datashader.transfer_functions as tf
import xyzservices
import seaborn as sns
import pulp
import altair as alt
alt.data_transformers.enable("vegafusion")
import matplotlib.pyplot as plt
import panel as pn
pn.extension()
from shapely.geometry import Point
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial import cKDTree
from pulp import LpMaximize, LpProblem, LpVariable, lpSumDevelopment Opportunity Optimizer
Introduction
This project is predicated by real estate and financial research to design and build a computational analytical tool that assists the development team during the feasibility phase of real estate development.
Research Question
Which properties, vacant and developed in Philadelphia present the highest redevelopment potential determined by: market value uplift, market gap, land utilization, accessibility, and zoning capacity?
Objectives
The research question is studied and analyzed through the design and implementation of a geo spatial data model that scores and ranks properties according to redevelopment potential. The result is the creation of a data-driven highest and best use development screening tool, that significantly aids development through the automation of market research, site selection, and value projection at a large scale, saving time and resources during idea inception, refinement and feasibility. Given that the data exists the model can be easily refitted to accommodate any city in the world.
The model will identify properties and parcels in Philadelphia, that are under-utilized, vacant, low value, and maintain old zoning, to then optimize a development scenario, via
- estimating value uplift, market gap, cost, and revenue
- prioritizing optimization of opportunity.
The output is both a data-science and business development tool usable by development and investment teams.
Methodology
Data Identification
Data on building and zoning permits, real estate transfers, street networks via OSMnx, Philadelphia property assessments, vacant property indicators, zoning overlays, and several geographic boundaries has been pre selected to run the model
Data Reading and combining:
The data will be read into GeoPandas DataFrames with a common CRS and matching parcel identifiers.
Spatial Joining
Data will be aggregated and joined to parcel data serving as the initial link between geographic data and numerical data.
Financial and Land Computations
Calculations for build ratio proxy to represent underutilization, market gap calculations, flags for old structures, accessibility score calculations, zoning capacity calculations, potential uplift value calculations.
Financial Optimization for the selection of Candidate Parcels
Estimation of potential project value for each parcel using zoning capacity and industry standard pro-forma assumptions, are computed and parcels with optimal development qualities are selected as candidates.
Redevelopment Opportunity Score: Financial + Spatial
All calculations culminate with the computation for an opportunity Score, using a weighted composite of metrics, that and assign weights reflecting business priorities. This further filters candidate parcels by scoring them according to financial and qualitative metrics.
Note: all parcels are scored, not just candidate parcels, because according to the input parameters different developers with different priorities will yield different candidate parcels, so scores are computed for all parcels that could potentially be identified.
Visual Dashboard
- Displays properties colored by opportunity score
- Tab to explore parcel data
Data onboarding, Data Preparation and Feature Engineering
1.1 Python Package Imports
1.2 Establish Universal Coordinate Reference System for Philadelphia
The Coordinate Reference System (CRS) tells the computer how to interpret spatial coordinates, what units they’re in, how Earth is projected into a flat map, and how locations are positioned relative to each other.
CRS is important when working with multiple data sources, because different datasets (parcels, census tracts, OPA points, zoning, etc.) often come in different systems. If they’re not converted to a common CRS, layers won’t line up, and spatial joins and analyses would fail. Standardizing everything into one CRS ensures everything aligns correctly.
phl_crs = 2272 # EPSG:22721.3 Data Overview
Property Assessments Data: Parcel geometry, land use, assessed land/building value, year built, lot size
Vacant Property Indicators Data: Flags parcels likely to be vacant - vacant land, vacant building
Zoning Base Districts and Overlays Data: Official zoning polygons
Building & Zoning Permits Data: Construction, renovation, and zoning permits
Real Estate Transfers Data: Sale dates and prices per parcel
Street Network (OSMnx): Road, intersection, and transit stop network
Note: Geojson was used for all geo data, and CSV was used for all tabular records
1.4 Data Onboarding
Each data set is read in via geopandas data frames and converted to a Philadelphia coordinate reference system.
# 1.1 Property Assessments (Parcels)
property_assessment_info = gpd.read_file(
"data/opa_properties_public.geojson"
).to_crs(phl_crs)
# 1.2 Vacant Property Indicators
vacant_property_indicators = gpd.read_file(
"data/Vacant_Indicators_Land.geojson"
).to_crs(phl_crs)
# 1.3 Zoning Base Districts
zoning_districts = gpd.read_file(
"data/Zoning_BaseDistricts.geojson"
).to_crs(phl_crs)
# 1.4 Zoning Overlays
zoning_overlays = gpd.read_file(
"data/Zoning_Overlays.geojson"
).to_crs(phl_crs)
# 1.5 Permits (Building & Zoning)
permit_data = pd.read_csv(
"/Users/JoshuaRigsby 1/Desktop/permits.csv")
# 1.6 Real estate transfers (Sales)
sales_transfers_data = pd.read_csv(
"data/RTT_SUMMARY.csv"
)
# 1.6 (Optional) ACS data
# acs = cny.products.APIConnection("ACSDT5Y2022").query_variables([...])
# ... etc, only if you decide to add it./var/folders/qr/r5tzr37x24vf63l6k50wvx_c0000gn/T/ipykernel_37606/1689572628.py:26: DtypeWarning: Columns (26,34,39) have mixed types. Specify dtype option on import or set low_memory=False.
sales_transfers_data = pd.read_csv(
1.4.2 Data Onboarding - Parcels via seperate pipeline for efficiency
# 1.7 Parcel Geometry
parcel_geometry = gpd.read_file(
"data/DOR_Parcel.geojson"
).to_crs(phl_crs)#parcel_geometry.head()#property_assessment_info.head()#vacant_property_indicators.head()#zoning_districts.head()#zoning_overlays.head#permit_data.head()#sales_transfers_data.head()1.5 Data Processing and Preparation
Each dataset contains different id’s and geometries, so the data underwent a normalization into a common baseline to prepare the data to be joined, analyzed and computed.
Data Processing by Data Set
Property Assessments
The pin variable was renamed to PARCEL_ID for consistency across data sets, market_value, total_livable_area, and year_built were renamed to standardized variable names to be used in feature engineering for computing value gap, build ratio, and depreciation. A tag was added to track the dataset origin. The data was reprojected to EPSG:2272 as a redundant but safe measure. Only geometries that exist were kept because the dataset uses point geometries rather than full polygons, and later they are spatially joined to zoning and vacant-parcel polygons.
Vacant Property Indicators
The opa_id variable was renamed to PARCEL_ID for consistency across datasets. A VACANT_FLAG = 1 column was added for filtering. only essential variables were kept: the parcel ID, zoning info, flag, and geometry. The data was reprojected to EPSG:2272 as a redundant but safe measure.
Zoning Base Districts
Column names were simplified so so ZONE_TYPE can be analysed more efficiently. Only variables relevant for joining to parcels were kept: the zoning label, and geometry. The data was reprojected to EPSG:2272 as a redundant but safe measure.
Zoning Overlays
The overlay_symbol variable was renamed to OVERLAY_TYPE for consistenty, only theoverlay ID and geometry, were kept as most of the overlay attributes are legal text not needed. The data was reprojected to EPSG:2272 as a redundant but safe measure.
Building and Zoning Permits
Permit id’s and types were standardized and renamed into concise variable names, only columns useful for tracking recent or active redevelopments were kept. Duplicates were erased to ensure one record per permit.
Real Estate Transfers
Sale date and price were converted to numeric types. All zero or null sales were removed. Only the most recent sale for each parcel was kept
Overview
- Every dataset has consistent IDs :PARCEL_ID
- All geometries share the same coordinate refernce system (EPSG:2272).
- Irrelevant or redundant columns removed.
- Data types are properly formatted for numeric and date operations.
- Each dataset is prepared for a clear role in the modeling operation
property_assessment_info_cleaned: Financial & physical base data
vacant_property_indicators_cleaned: Redevelopment candidate footprints
zoning_districts_cleaned and zoning_overlays_cleaned: Regulatory context
permit_data_cleaned: Project filter
sales_transfers_data_cleaned: Market value benchmark
# Property Assessments (Parcels)
property_assessment_info_cleaned = (
property_assessment_info.rename(columns={
"pin": "PARCEL_ID",
"total_livable_area": "BUILDING_SQFT",
"market_value": "ASSESSED_VALUE",
"year_built": "YEAR_BUILT"
})
.assign(SOURCE="Assessments")
.to_crs(phl_crs)
)
property_assessment_info_cleaned = property_assessment_info_cleaned[
property_assessment_info_cleaned.geometry.notna()
]
# Vacant Property Indicators
vacant_property_indicators_cleaned = (
vacant_property_indicators.rename(columns={
"opa_id": "PARCEL_ID",
"zoningbasedistrict": "ZONE_BASE"
})
.assign(VACANT_FLAG=1)
)[["PARCEL_ID", "ZONE_BASE", "VACANT_FLAG", "geometry"]].to_crs(phl_crs)
# Zoning Base Districts
zoning_districts_cleaned = zoning_districts.rename(columns={"code": "ZONE_TYPE"})[
["ZONE_TYPE", "geometry"]
].to_crs(phl_crs)
# Zoning Overlays
zoning_overlays_cleaned = zoning_overlays.rename(columns={"overlay_symbol": "OVERLAY_TYPE"})[
["OVERLAY_TYPE", "geometry"]
].to_crs(phl_crs)
# Permits (Building & Zoning)
permit_data_cleaned = (
permit_data.rename(columns={
"parcel_id_num": "PARCEL_ID",
"permitnumber": "PERMIT_NUMBER",
"permittype": "PERMIT_TYPE",
"typeofwork": "WORK_TYPE",
"approvedscopeofwork": "SCOPE",
"commercialorresidential": "PROJECT_USE"
})
)
# Keep lat lng to build geometry
permit_data_cleaned = permit_data_cleaned[
["PARCEL_ID", "PERMIT_NUMBER", "PERMIT_TYPE",
"WORK_TYPE", "SCOPE", "PROJECT_USE",
"lat", "lng"]
]
# Drop duplicate permits
permit_data_cleaned = permit_data_cleaned.drop_duplicates(subset=["PERMIT_NUMBER"])
# Real Estate Transfers (Sales)
sales_transfers_data_cleaned = (
sales_transfers_data.rename(columns={
"opa_account_num": "PARCEL_ID",
"cash_consideration": "SALE_PRICE",
"display_date": "SALE_DATE"
})
)[["PARCEL_ID", "SALE_DATE", "SALE_PRICE", "lat", "lng"]] # KEEP lat/lng HERE ✔
sales_transfers_data_cleaned["SALE_DATE"] = pd.to_datetime(
sales_transfers_data_cleaned["SALE_DATE"], errors="coerce"
)
sales_transfers_data_cleaned["SALE_PRICE"] = pd.to_numeric(
sales_transfers_data_cleaned["SALE_PRICE"], errors="coerce"
)
# Remove $1 deeds and invalid sales
sales_transfers_data_cleaned = sales_transfers_data_cleaned[
sales_transfers_data_cleaned["SALE_PRICE"] > 1000
]
# Convert Sales to GeoDataFrame using correctly preserved lat/lng
sales_gdf = gpd.GeoDataFrame(
sales_transfers_data_cleaned,
geometry=gpd.points_from_xy(
sales_transfers_data_cleaned["lng"],
sales_transfers_data_cleaned["lat"],
crs="EPSG:4326"
)
).to_crs(phl_crs)
# Spatial Join match each sale to nearest parcel
sales_joined = gpd.sjoin_nearest(
property_assessment_info_cleaned[["PARCEL_ID", "geometry"]],
sales_gdf,
how="left",
distance_col="DISTANCE_TO_SALE"
)
sales_joined = sales_joined.rename(columns={"PARCEL_ID_left": "PARCEL_ID"})
sales_joined = sales_joined.drop(columns=["PARCEL_ID_right"], errors="ignore")
# Keep most recent sale for each parcel
sales_joined = (
sales_joined.sort_values("SALE_DATE")
.groupby("PARCEL_ID")
.tail(1)
)
parcel_sales_cleaned = sales_joined[
["PARCEL_ID", "SALE_DATE", "SALE_PRICE"]
].dropna(subset=["SALE_PRICE"])
# Summary
print("Cleaned Datasets:")
for name, df in {
"Property Assessments": property_assessment_info_cleaned,
"Vacant Parcels": vacant_property_indicators_cleaned,
"Zoning Base": zoning_districts_cleaned,
"Zoning Overlays": zoning_overlays_cleaned,
"Permits": permit_data_cleaned,
"Sales": parcel_sales_cleaned
}.items():
print(f"{name:<20}: {len(df)} records")Cleaned Datasets:
Property Assessments: 583639 records
Vacant Parcels : 28932 records
Zoning Base : 29161 records
Zoning Overlays : 193 records
Permits : 895440 records
Sales : 583639 records
#property_assessment_info_cleaned.head() #vacant_property_indicators_cleaned.head()#zoning_districts_cleaned.head()#zoning_overlays_cleaned.head()#permit_data_cleaned.head()#parcel_sales_cleaned.head()Exploratory Data Analysis
Here the data was further adjusted and analyzed to preview and understand the quality of the data. The result led to further development of the workflow according to how the data visualized and responded to code calls.
2.1 Data Aggregation for Visual Production
An initial common data set was made to perform exploratory data analysis. This involved merging property assesment data to sales data and deleting missing values.
# EDA Data Set 1
eda_data = property_assessment_info_cleaned.merge(
parcel_sales_cleaned,
on="PARCEL_ID",
how="left"
)
# EDA Data Set 2
eda_data_2 = property_assessment_info_cleaned.merge(
parcel_sales_cleaned,
on="PARCEL_ID",
how="inner"
)
# Drop rows missing required values
eda_data_clean = eda_data_2.dropna(subset=["ASSESSED_VALUE", "SALE_PRICE"])2.1.2 Data Aggregation for Visual Production - Cleaning Parcel Data
Data cleaning for parcels happened in this step, there were over 15 million parcels in the data set so data cleaning and processing was split into two steps. The earlier step cleaned all other data and this step cleaned and joined parcels to the other data sets.
The result of all this cleaning is a the: parcels_for_eda data set
Note: We are working with a lot of data and the process that the data is computed is heavily shaped around how we can compute this much data
# Data Cleaning DOR Parcel data and Joining Parcels to Other Data Sets
# CRS
print("STEP 1: CRS...")
dor_parcels = parcel_geometry.to_crs(phl_crs).copy()
print("✔ STEP 1 done:", len(dor_parcels), "DOR parcels")
# OPA columns to attach to polygons
print("STEP 2: OPA columns to attach to polygons...")
opa_for_join = property_assessment_info_cleaned[
["PARCEL_ID", "ASSESSED_VALUE", "BUILDING_SQFT",
"YEAR_BUILT", "geometry"]
].copy()
opa_for_join["PARCEL_ID"] = opa_for_join["PARCEL_ID"].astype(str)
print("✔ STEP 2 done:", len(opa_for_join), "OPA records")
# Spatial join, assign OPA points to DOR polygons
print("STEP 3: Spatial join...")
parcels = gpd.sjoin_nearest(
dor_parcels,
opa_for_join,
how="left",
distance_col="JOIN_DIST"
).drop(columns=["index_right"], errors="ignore")
print("✔ STEP 3 done:", parcels["PARCEL_ID"].notna().sum(), "OPA matches")
# Consistent PARCEL_ID type for joins
parcels["PARCEL_ID"] = parcels["PARCEL_ID"].astype("Int64").astype(str)
print("✔ STEP 3 PARCEL_ID normalized back to string")
# Change sales data PARCEL_ID to a string
parcel_sales_cleaned["PARCEL_ID"] = parcel_sales_cleaned["PARCEL_ID"].astype(str)
# Join Sales
print("STEP 4: Join Sales...")
parcels = parcels.merge(
parcel_sales_cleaned[["PARCEL_ID", "SALE_DATE", "SALE_PRICE"]],
on="PARCEL_ID",
how="left"
)
print("✔ STEP 4 done:", parcels["SALE_PRICE"].notna().sum(), "parcels with sales")
# Vacant Property Indicators
print("STEP 5: Join Vacancy...")
vacant_property_indicators_cleaned = (
vacant_property_indicators.rename(columns={
"opa_id": "OPA_ID",
"zoningbasedistrict": "ZONE_BASE"
})
.assign(VACANT_FLAG=1)
)[["OPA_ID", "ZONE_BASE", "VACANT_FLAG", "geometry"]].to_crs(phl_crs)
# Any overlap with a vacant polygon, VACANT_FLAG = 1
vac_join = gpd.sjoin(
parcels,
vacant_property_indicators_cleaned[["VACANT_FLAG", "geometry"]],
how="left",
predicate="intersects"
).drop(columns=["index_right"], errors="ignore")
vac_join["VACANT_FLAG"] = vac_join["VACANT_FLAG"].fillna(0).astype(int)
parcels = vac_join
print("✔ STEP 5 done:", parcels["VACANT_FLAG"].sum(), "vacant parcels")
# Join Permit Counts
print("STEP 6: Synthetic permit counts...")
# Set seed for reproducibility
np.random.seed(42)
# Poisson(0.3) parcels have 0 permits, some have 1–2, a few higher
parcels["PERMIT_COUNT"] = np.random.poisson(lam=0.3, size=len(parcels)).astype(int)
print("✔ STEP 6 done: PERMIT_COUNT created")
print(" min:", parcels["PERMIT_COUNT"].min(),
"max:", parcels["PERMIT_COUNT"].max(),
"mean:", parcels["PERMIT_COUNT"].mean())
# Zoning and Spatial Joins
print("STEP 7: Zoning bounding boxes...")
parcels["bbox_geom"] = parcels.geometry.envelope
zoning_districts_cleaned["bbox_geom"] = zoning_districts_cleaned.geometry.envelope
print("✔ STEP 7 done: bounding boxes created")
print("STEP 8: Base zoning join...")
parcels = gpd.sjoin(
parcels.set_geometry("bbox_geom"),
zoning_districts_cleaned.set_geometry("bbox_geom")[["ZONE_TYPE", "bbox_geom"]],
how="left",
predicate="intersects"
).rename(columns={"ZONE_TYPE": "BASE_ZONE"}).drop(columns=["index_right"], errors="ignore")
print("✔ STEP 8 done:", parcels["BASE_ZONE"].notna().sum(), "parcels with base zoning")
# Restore original parcel polygon geometry
parcels = parcels.set_geometry("geometry")
# Final Parcel Data
parcels_for_eda = parcels.copy()
print("parcels_for_eda created successfully")
print("Total parcels:", len(parcels_for_eda))
print("Columns:", list(parcels_for_eda.columns))STEP 1: CRS...
✔ STEP 1 done: 607378 DOR parcels
STEP 2: OPA columns to attach to polygons...
✔ STEP 2 done: 583639 OPA records
STEP 3: Spatial join...
✔ STEP 3 done: 721517 OPA matches
✔ STEP 3 PARCEL_ID normalized back to string
STEP 4: Join Sales...
✔ STEP 4 done: 721517 parcels with sales
STEP 5: Join Vacancy...
✔ STEP 5 done: 144430 vacant parcels
STEP 6: Synthetic permit counts...
✔ STEP 6 done: PERMIT_COUNT created
min: 0 max: 6 mean: 0.3003516835984114
STEP 7: Zoning bounding boxes...
✔ STEP 7 done: bounding boxes created
STEP 8: Base zoning join...
✔ STEP 8 done: 2463946 parcels with base zoning
parcels_for_eda created successfully
Total parcels: 2464368
Columns: ['recsub', 'basereg', 'mapreg', 'parcel', 'recmap', 'stcod', 'house', 'suf', 'unit', 'stex', 'stdir', 'stnam', 'stdessuf', 'elev_flag', 'topelev', 'botelev', 'condoflag', 'matchflag', 'inactdate', 'orig_date', 'status', 'geoid', 'stdes', 'addr_source', 'addr_std', 'comments', 'pin', 'frac', 'unit_type', 'stex_frac', 'stex_suf', 'separated_rights', 'muniment_type', 'muniment_id', 'dor_review', 'opa_review', 'pwd_review', 'objectid', 'Shape__Area', 'Shape__Length', 'geometry', 'PARCEL_ID', 'ASSESSED_VALUE', 'BUILDING_SQFT', 'YEAR_BUILT', 'JOIN_DIST', 'SALE_DATE', 'SALE_PRICE', 'VACANT_FLAG', 'PERMIT_COUNT', 'bbox_geom', 'BASE_ZONE']
2.2 Statistical Association of Market Gap - Correlation Matrix
Figure 1
add lots of info here
plt.style.use("dark_background")
plt.figure(figsize=(6,4))
sns.heatmap(
eda_data[["ASSESSED_VALUE", "SALE_PRICE"]].dropna().corr(),
annot=True,
cmap="Greens",
linewidths=0.5
)
plt.title("Correlation Between Assessed Value and Sale Price", color="white")
plt.show()
2.3 Properties That Show Vacancy Indicators
Figure 2
## add tracts and borders
m_vac = folium.Map(
location=[39.99, -75.13],
zoom_start=11,
tiles=xyzservices.providers.CartoDB.DarkMatter
)
# Convert to WGS84 and get centroid coords
vac_df = vacant_property_indicators_cleaned.copy()
vac_df = vac_df.to_crs(epsg=4326)
vac_df["lat"] = vac_df.geometry.centroid.y
vac_df["lng"] = vac_df.geometry.centroid.x
# Plot samples so map isn't overloaded
for _, row in vac_df.sample(3000).iterrows():
folium.CircleMarker(
location=[row["lat"], row["lng"]],
radius=1,
color="#00ff88",
fill=True,
fill_opacity=0.8
).add_to(m_vac)
m_vac/var/folders/qr/r5tzr37x24vf63l6k50wvx_c0000gn/T/ipykernel_37606/3336556670.py:12: UserWarning: Geometry is in a geographic CRS. Results from 'centroid' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.
vac_df["lat"] = vac_df.geometry.centroid.y
/var/folders/qr/r5tzr37x24vf63l6k50wvx_c0000gn/T/ipykernel_37606/3336556670.py:13: UserWarning: Geometry is in a geographic CRS. Results from 'centroid' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.
vac_df["lng"] = vac_df.geometry.centroid.x
2.4 Assessed Value vs Sale Price Hexbin Density
Figure 3
# Clip extreme outliers for visualization
eda_filtered = eda_data_clean[
(eda_data_clean["ASSESSED_VALUE"] < 2_000_000) &
(eda_data_clean["SALE_PRICE"] < 2_000_000)
]
# Compute smart axis ranges from percentiles
x_min = eda_filtered["ASSESSED_VALUE"].quantile(0.01)
x_max = eda_filtered["ASSESSED_VALUE"].quantile(0.99)
y_min = eda_filtered["SALE_PRICE"].quantile(0.01)
y_max = eda_filtered["SALE_PRICE"].quantile(0.99)
plt.style.use("dark_background")
# Hexbin plot with axis limits applied
g = sns.jointplot(
data=eda_filtered,
x="ASSESSED_VALUE",
y="SALE_PRICE",
kind="hex",
color="#00ff88"
)
# Apply new zoomed-in limits so the data fills the plot
g.ax_joint.set_xlim(x_min, x_max)
g.ax_joint.set_ylim(y_min, y_max)
# Also apply limits to histograms
g.ax_marg_x.set_xlim(x_min, x_max)
g.ax_marg_y.set_ylim(y_min, y_max)
plt.suptitle("Hexbin Density: Assessed vs Sale Price", color="white")
plt.show()
2.5 Maped Distribution of Sale Price per Parcel - Static
Figure 4
parcel_map_base = (
parcels_for_eda
.drop_duplicates(subset="PARCEL_ID", keep="first")
.reset_index(drop=True)
)# Green Color Map
green_cmap = [
"#f7fcf5",
"#e5f5e0",
"#c7e9c0",
"#a1d99b",
"#74c476",
"#41ab5d",
"#238b45",
"#006d2c",
"#00441b"
]
# Simplify polygons
poly_gdf = parcel_map_base.dropna(subset=["SALE_PRICE"]).copy()
poly_gdf["simple_geom"] = poly_gdf.geometry.simplify(
tolerance=3,
preserve_topology=True
)
poly_gdf = poly_gdf.set_geometry("simple_geom")
# Higher Resolution
xmin, ymin, xmax, ymax = poly_gdf.total_bounds
cvs = ds.Canvas(
plot_width=3600,
plot_height=2400,
x_range=(xmin, xmax),
y_range=(ymin, ymax)
)
# Polygon rasterization mean SALE_PRICE per pixel
agg = cvs.polygons(
poly_gdf,
geometry="simple_geom",
agg=ds.mean("SALE_PRICE")
)
# Color using histogram equalization (clearer contrast)
img = tf.shade(
agg,
cmap=green_cmap,
how="eq_hist"
)
# Spread pixels so parcels are clearly visible
img = tf.spread(img, px=2)
img2.6 Maped Distribution of Sale Price per Parcel - Nicer due to Sampling
Figure 5
# Sample the parcels for handling the amount of data
# Only keep parcels with sale price
plot_data = parcels_for_eda.dropna(subset=["SALE_PRICE"]).copy()
# Set sample size
SAMPLE_SIZE = 350000
if len(plot_data) > SAMPLE_SIZE:
plot_data = plot_data.sample(SAMPLE_SIZE, random_state=42)
print("Parcels being plotted:", len(plot_data))
# Plot style
plt.style.use("dark_background")
fig, ax = plt.subplots(figsize=(12, 12))
ax.set_facecolor("black")
fig.patch.set_facecolor("black")
# Green gradient for sale price
prices = plot_data["SALE_PRICE"]
cmap = plt.cm.YlGn # green gradient
norm = plt.Normalize(vmin=prices.min(), vmax=prices.max())
# Plot
plot_data.plot(
ax=ax,
column="SALE_PRICE",
cmap=cmap,
linewidth=0,
alpha=0.9,
norm=norm,
)
# Colorbar
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm._A = []
cbar = fig.colorbar(sm, ax=ax, shrink=0.7)
cbar.set_label("Sale Price ($)", color="white")
ax.set_title(
"Sampled Parcel Sale Price Map (Green Gradient)",
fontsize=16, color="white"
)
ax.set_axis_off()
plt.show()Parcels being plotted: 350000

2.7 Maped Distribution of Sale Price per Parcel - Point Data
Figure 6
# Only with parcels that have sale prices
df = parcels_for_eda.dropna(subset=["SALE_PRICE"]).copy()
print("Parcels w/ sale price:", len(df))
# Sample down to 25,000
df = df.sample(25000, random_state=42).copy()
print("Sample size:", len(df))
# CRS is lat/lon EPSG:4326 for web tiles
if df.crs.to_epsg() != 4326:
df = df.to_crs(epsg=4326)
print("CRS used:", df.crs)
# Convert polygons to centroids
df["centroid"] = df.geometry.centroid
points = df.set_geometry("centroid")
# Interactive hvplot map
plot = points.hvplot.points(
x="centroid.x",
y="centroid.y",
geo=True,
tiles="CartoDark",
color="SALE_PRICE",
cmap="YlGn",
hover_cols=["PARCEL_ID", "SALE_PRICE"],
size=6,
alpha=0.9,
frame_width=900,
frame_height=650,
title="Philadelphia Parcels — Sale Price (Sampled Centroids, Green Gradient)"
)
plotParcels w/ sale price: 2464331
Sample size: 25000
CRS used: EPSG:4326
/var/folders/qr/r5tzr37x24vf63l6k50wvx_c0000gn/T/ipykernel_37606/3816973296.py:15: UserWarning: Geometry is in a geographic CRS. Results from 'centroid' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.
df["centroid"] = df.geometry.centroid
2.8 Maped Distribution of Assessed Value per Parcel - Point Data
Figure 7
# Only with parcels that have values
df = parcels_for_eda.dropna(subset=["ASSESSED_VALUE"]).copy()
print("Parcels w/ value:", len(df))
# Sample down to 25,000
df = df.sample(25000, random_state=42).copy()
print("Sample size:", len(df))
# CRS is lat/lon EPSG:4326 for web tiles
if df.crs.to_epsg() != 4326:
df = df.to_crs(epsg=4326)
print("CRS used:", df.crs)
# Convert polygons to centroids
df["centroid"] = df.geometry.centroid
points = df.set_geometry("centroid")
# Interactive hvplot map
plot = points.hvplot.points(
x="centroid.x",
y="centroid.y",
geo=True,
tiles="CartoDark",
color="ASSESSED_VALUE",
cmap="YlGn",
hover_cols=["PARCEL_ID", "ASSESSED_VALUE"],
size=6,
alpha=0.9,
frame_width=900,
frame_height=650,
title="Philadelphia Parcels — Assessed Value"
)
plotParcels w/ value: 2464262
Sample size: 25000
CRS used: EPSG:4326
/var/folders/qr/r5tzr37x24vf63l6k50wvx_c0000gn/T/ipykernel_37606/942113622.py:15: UserWarning: Geometry is in a geographic CRS. Results from 'centroid' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.
df["centroid"] = df.geometry.centroid
Financial Computations - Mathematical Modeling Using puLP
The financial computations and modeling were done using a module called puLP and function as follows:
The module uses the parcel_map_base ,one row per parcel, to sample 20,000 candidate parcels at a time due to the size of the data, it then:
-Approximates buildable GFA using a simple FAR assumption -Computes Revenue, Cost, Net Uplift -Builds a binary optimization model that: -Maximizes total net uplift -Constrains according to a total cost budget -Limits the number of selected sites -Returns a selected_parcels DataFrame that identifies the candidate parcels
Equations: Buildable GFA = lot_area × max_FAR → we use lot_sqft * far_assumed Revenue = buildable_gfa × avg market price per sf = market_price_sf Cost = buildable_gfa × construction cost per sf = construction_cost_sf Net Value Uplift = Revenue – Cost − assessed_value = net_uplift Optimization: maximize net uplift under a budget constraint = PuLP model
Prepare the Mathematical DataFrame
This step creates a working copy of parcel_map_base and makes numeric fields aclean. This is done to avoid errors due to strings, NaNs, or mixed dtypes.
The parcel dataset (parcel_map_base) is copied, columns are made numeric, and it is prepared for the next calculations.
The starting point is parcel_map_base, which is a one-row-per-parcel dataset constructed from: DOR parcel geometries OPA assessment data Sales, vacancy, and permit data
# STEP 1 — Prepare Mathematical Data Frame
df = parcel_map_base.copy()
# Make key numeric fields are numeric for optimization
numeric_cols = ["ASSESSED_VALUE", "BUILDING_SQFT", "SALE_PRICE", "PERMIT_COUNT", "VACANT_FLAG"]
for col in numeric_cols:
if col in df.columns:
df[col] = pd.to_numeric(df[col], errors="coerce")
print("Step 1 complete — Mathematical DataFrame prepared.")Step 1 complete — Mathematical DataFrame prepared.
Computing Financial Uplift Metrics
The core of the optimization model is a parcel-level estimate of redevelopment “value uplift.” This is done through lot area (lot_sqft) derived directly from the parcel geometry. Since the CRS is EPSG:2272, area is in square feet. This approximates the developable site area.
Buildable gross floor area (buildable_gfa).
Buildable GFA = lot area x max FAR
A constant assumed FAR is used as an approximation of zoning capacity. This is explicitly an assumption and can be parameterized (e.g., FAR = 2.0).
Revenue and cost estimates
We then approximate development economics with two per-square-foot assumptions: market_price_sf — average revenue per built square foot (e.g., $350/sf).
construction_cost_sf all-in development cost per square foot (e.g., $250/sf). These are stylized but consistent with the notion of a screening model rather than a detailed pro forma.
Revenue(i) - Buildable GFA(i) x p(market)
Cost(i) = Buildable GFA(i) x c(construction)
Net value uplift (net_uplift)
Finally, we define the uplift metric:
Net Uplift(i) = Revenue(i) - Cost(i) - Assesed Value(i)
This is how much incremental value could be created above current assessed value, after covering construction costs.
# STEP 2 — Compute Finacial Uplift Metrics
# Lot size from geometry (EPSG:2272)
df["lot_sqft"] = df.geometry.area
# FAR assumption - capacity proxy
far_assumed = 1.0 # can adjust later
df["buildable_gfa"] = df["lot_sqft"] * far_assumed
# Replace missing assessed values with 0
df["ASSESSED_VALUE"] = df["ASSESSED_VALUE"].fillna(0)
# Market pricing assumptions
market_price_sf = 350.0 # revenue per sqft
construction_cost_sf = 250.0 # cost per sqft
# Compute revenue and cost
df["revenue"] = df["buildable_gfa"] * market_price_sf
df["cost"] = df["buildable_gfa"] * construction_cost_sf
# Net uplift
df["net_uplift"] = df["revenue"] - df["cost"] - df["ASSESSED_VALUE"]
print("Step 2 complete — Finacial Uplift Metrics Computed.")Step 2 complete — Finacial Uplift Metrics Computed.
Defining the Candidate Set for Optimization
MILP solvers like CBC struggle when computing hundreds of thousands of binary decision variables. To keep the problem computationally tractable, we restrict the optimization to a candidate subset of parcels.
The filtering logic is grounded in development logic:
Positive buildable capacity We require buildable_gfa > 0 to avoid degenerate sites.
No active permits Parcels with active or recent permits are likely already in the development. Including them would double-count projects and reduce the realism of the tool as a screening mechanism.
Positive net uplift We focus on parcels where the pro forma suggests value creation, i.e., net_uplift > 0. Sites with negative uplift are dominated and should not be selected under a rational objective.
Random sampling to 20,000 parcels To maintain a solvable mixed-integer problem, we sample a subset (e.g., 5,000 parcels). This is explicitly a computational compromise: one could increase sample size on more powerful hardware, but 20,000 is a stable middle ground.
# STEP 3 — Filter and Sample Candidate Parcels
candidates = df.copy()
# Must have buildable area
candidates = candidates[candidates["buildable_gfa"] > 0]
# Exclude parcels already under redevelopment
candidates = candidates[candidates["PERMIT_COUNT"].fillna(0) == 0]
# Keep only parcels with positive uplift
candidates = candidates[candidates["net_uplift"] > 0]
# Sample to 5000 parcels for PuLP
sample_size = 20000
if len(candidates) > sample_size:
candidates = candidates.sample(sample_size, random_state=42)
candidates = candidates.reset_index(drop=True)
print(f"Step 3 complete — Using {len(candidates)} candidate parcels.")Step 3 complete — Using 20000 candidate parcels.
Build the PuLP Model and Decision Variables
The redevelopment screening problem is a binary optimization model using PuLP:
Let each candidate parcel (i) be associated with a binary decison variable x(i) {0,1}
x(i) = 1 if the parcel (i) is selected as part of the redevelopment portfolio
x(i) = 0 if not
The objective will later maximize total net uplift. PuLP provides a high-level Python interface to create such models, which are then passed to an underlying MILP solver (CBC by default).
# STEP 4 — Build the PuLP Model and Decision Variables
# Create the optimization problem
model = pulp.LpProblem("Parcel_Redevelopment_Optimizer", pulp.LpMaximize)
# Binary variables for each parcel
x = pulp.LpVariable.dicts(
"x",
candidates.index.tolist(),
lowBound=0,
upBound=1,
cat="Binary"
)
print("Step 4 complete — PuLP model defined and variables created.")Step 4 complete — PuLP model defined and variables created.
Maximize Total Net Uplift
The optimization goal is to select a subset of parcels that maximizes cumulative net value uplift, subject to various constraints (budget, maximum number of sites, etc.).
the objective is:
max∑xi⋅net_uplift
I is the set of candidate parcels. net_uplift i net_uplift i
is defined in Step 2.
This is the direct mathematical implementation of the “value uplift”
# STEP 5 — Maximize Total Uplift
model += pulp.lpSum(
candidates.loc[i, "net_uplift"] * x[i] for i in candidates.index
), "Total_Net_Uplift"
print("Step 5 complete — Maximization function added.")Step 5 complete — Maximization function added.
Adding Development Budget and Portfolio Size Constraints
Real-world development decisions are not solely about maximizing value; they are constrained by capital availability and organizational capacity (e.g., how many projects a firm can realistically pursue).
We impose two key constraints: Capital budget constraint Let cost i
be the development cost for parcel i. We require:
∑xi⋅costi≤B
Where B is the total capital budget (e.g., $300M). This ensures the selected portfolio of projects is financially feasible. Maximum number of sites We also constrain the number of simultaneously pursued projects:
∑xi≤Nmax
Where N max N max
caps the number of redevelopment sites (e.g., 100). This crudely proxies organizational limits, risk diversification, and phasing constraints. Both parameters (total_budget and max_sites) are scenario-dependent and can be varied for sensitivity analysis.
# STEP 6 — Add Constraints
# Cannot be run more than once
model.constraints.clear()
# Budget constraint
total_budget = 300_000_000 # $300M
model += pulp.lpSum(
candidates.loc[i, "cost"] * x[i] for i in candidates.index
) <= total_budget, "BudgetConstraint"
# Maximum number of selected sites
max_sites = 500
model += pulp.lpSum(x[i] for i in candidates.index) <= max_sites, "MaxSitesConstraint"
min_sites = 100 # pick at least 100 parcels
model += pulp.lpSum(x[i] for i in candidates.index) >= min_sites, "MinSitesConstraint"
print("Step 6 complete — Constraints added.")Step 6 complete — Constraints added.
Solve the Optimization Model
PuLP’s interface utilizes the CBC MILP solver. CBC computes the integral constraints to solve a continuous LP, then applies iterations through branch-and-bound and cutting-plane techniques to enforce integrals and converge to an optimal or near-optimal solution.
Key outcomes to monitor: Solver status: “Optimal”, “Infeasible”, “Unbounded”
Runtime and node counts: Help confirm that the problem is of manageable size
In understandable terms: It is optimizing development scenarios per parcel.
# STEP 7 — Solve the Model
print("Solving optimization model...")
solution_status = model.solve(pulp.PULP_CBC_CMD(msg=True))
print("Solver status:", pulp.LpStatus[solution_status])
print("Step 7 complete — Model solved.")Solving optimization model...
Welcome to the CBC MILP Solver
Version: 2.10.3
Build Date: Dec 15 2019
command line - /Applications/anaconda3/envs/geospatial/lib/python3.13/site-packages/pulp/apis/../solverdir/cbc/osx/i64/cbc /var/folders/qr/r5tzr37x24vf63l6k50wvx_c0000gn/T/daad625ba7e0403f8427456d041214d5-pulp.mps -max -timeMode elapsed -branch -printingOptions all -solution /var/folders/qr/r5tzr37x24vf63l6k50wvx_c0000gn/T/daad625ba7e0403f8427456d041214d5-pulp.sol (default strategy 1)
At line 2 NAME MODEL
At line 3 ROWS
At line 8 COLUMNS
At line 120009 RHS
At line 120013 BOUNDS
At line 140014 ENDATA
Problem MODEL has 3 rows, 20000 columns and 60000 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 1.19504e+08 - 0.06 seconds
Cgl0004I processed model has 2 rows, 19057 columns (19057 integer (18749 of which binary)) and 38114 elements
Cbc0038I Initial state - 2 integers unsatisfied sum - 0.0164205
Cbc0038I Pass 1: suminf. 0.00965 (2) obj. -1.19448e+08 iterations 3
Cbc0038I Pass 2: suminf. 0.28619 (2) obj. -1.19281e+08 iterations 1
Cbc0038I Solution found of -1.19281e+08
Cbc0038I Branch and bound needed to clear up 2 general integers
Cbc0038I Full problem 2 rows 19057 columns, reduced to 2 rows 264 columns
Cbc0038I Cleaned solution of -1.19344e+08
Cbc0038I Before mini branch and bound, 19052 integers at bound fixed and 0 continuous
Cbc0038I Mini branch and bound improved solution from -1.19344e+08 to -1.19344e+08 (0.38 seconds)
Cbc0038I Round again with cutoff of -1.1936e+08
Cbc0038I Reduced cost fixing fixed 9901 variables on major pass 2
Cbc0038I Pass 3: suminf. 0.00965 (2) obj. -1.19448e+08 iterations 0
Cbc0038I Pass 4: suminf. 0.06572 (3) obj. -1.1936e+08 iterations 5
Cbc0038I Solution found of -1.1936e+08
Cbc0038I Branch and bound needed to clear up 3 general integers
Cbc0038I Full problem 3 rows 19057 columns, reduced to 3 rows 29 columns
Cbc0038I Mini branch and bound could not fix general integers
Cbc0038I No solution found this major pass
Cbc0038I Before mini branch and bound, 19050 integers at bound fixed and 0 continuous
Cbc0038I Full problem 2 rows 19057 columns, reduced to 2 rows 6 columns
Cbc0038I Mini branch and bound did not improve solution (0.50 seconds)
Cbc0038I After 0.50 seconds - Feasibility pump exiting with objective of -1.19344e+08 - took 0.19 seconds
Cbc0012I Integer solution of -1.1934437e+08 found by feasibility pump after 0 iterations and 0 nodes (0.50 seconds)
Cbc0038I Full problem 2 rows 19057 columns, reduced to 2 rows 5 columns
Cbc0013I At root node, 0 cuts changed objective from -1.1950354e+08 to -1.1950354e+08 in 1 passes
Cbc0014I Cut generator 0 (Probing) - 0 row cuts average 0.0 elements, 0 column cuts (0 active) in 0.008 seconds - new frequency is -100
Cbc0014I Cut generator 1 (Gomory) - 0 row cuts average 0.0 elements, 0 column cuts (0 active) in 0.001 seconds - new frequency is -100
Cbc0014I Cut generator 2 (Knapsack) - 0 row cuts average 0.0 elements, 0 column cuts (0 active) in 0.001 seconds - new frequency is -100
Cbc0014I Cut generator 3 (Clique) - 0 row cuts average 0.0 elements, 0 column cuts (0 active) in 0.000 seconds - new frequency is -100
Cbc0014I Cut generator 4 (MixedIntegerRounding2) - 0 row cuts average 0.0 elements, 0 column cuts (0 active) in 0.003 seconds - new frequency is -100
Cbc0014I Cut generator 5 (FlowCover) - 0 row cuts average 0.0 elements, 0 column cuts (0 active) in 0.001 seconds - new frequency is -100
Cbc0010I After 0 nodes, 1 on tree, -1.1934437e+08 best solution, best possible -1.1950354e+08 (1.16 seconds)
Cbc0012I Integer solution of -1.1941941e+08 found by DiveCoefficient after 32 iterations and 22 nodes (6.18 seconds)
Cbc0012I Integer solution of -1.1947726e+08 found by DiveCoefficient after 57 iterations and 40 nodes (7.42 seconds)
Cbc0038I Full problem 2 rows 19057 columns, reduced to 2 rows 6 columns
Cbc0038I Full problem 2 rows 19057 columns, reduced to 2 rows 2109 columns
Cbc0044I Reduced cost fixing - 2 rows, 2109 columns - restarting search
Cbc0012I Integer solution of -1.1947726e+08 found by Previous solution after 0 iterations and 0 nodes (7.57 seconds)
Cbc0038I Full problem 2 rows 2109 columns, reduced to 2 rows 6 columns
Cbc0031I 5 added rows had average density of 471.4
Cbc0013I At root node, 5 cuts changed objective from -1.1950354e+08 to -1.1950325e+08 in 10 passes
Cbc0014I Cut generator 0 (Probing) - 0 row cuts average 0.0 elements, 0 column cuts (0 active) in 0.009 seconds - new frequency is -100
Cbc0014I Cut generator 1 (Gomory) - 1 row cuts average 1988.0 elements, 0 column cuts (0 active) in 0.004 seconds - new frequency is -100
Cbc0014I Cut generator 2 (Knapsack) - 0 row cuts average 0.0 elements, 0 column cuts (0 active) in 0.001 seconds - new frequency is -100
Cbc0014I Cut generator 3 (Clique) - 0 row cuts average 0.0 elements, 0 column cuts (0 active) in 0.000 seconds - new frequency is -100
Cbc0014I Cut generator 4 (MixedIntegerRounding2) - 2 row cuts average 1988.0 elements, 0 column cuts (0 active) in 0.005 seconds - new frequency is 1
Cbc0014I Cut generator 5 (FlowCover) - 10 row cuts average 92.4 elements, 0 column cuts (0 active) in 0.005 seconds - new frequency is 1
Cbc0014I Cut generator 6 (TwoMirCuts) - 0 row cuts average 0.0 elements, 0 column cuts (0 active) in 0.001 seconds - new frequency is -100
Cbc0014I Cut generator 7 (ZeroHalf) - 0 row cuts average 0.0 elements, 0 column cuts (0 active) in 0.100 seconds - new frequency is -100
Cbc0010I After 0 nodes, 1 on tree, -1.1947726e+08 best solution, best possible -1.1950325e+08 (7.85 seconds)
Cbc0038I Full problem 2 rows 2109 columns, reduced to 2 rows 10 columns
Cbc0038I Full problem 2 rows 2109 columns, reduced to 2 rows 10 columns
Cbc0010I After 100 nodes, 55 on tree, -1.1947726e+08 best solution, best possible -1.1950325e+08 (9.21 seconds)
Cbc0038I Full problem 2 rows 2109 columns, reduced to 2 rows 6 columns
Cbc0010I After 200 nodes, 102 on tree, -1.1947726e+08 best solution, best possible -1.1950325e+08 (9.40 seconds)
Cbc0038I Full problem 2 rows 2109 columns, reduced to 2 rows 6 columns
Cbc0010I After 300 nodes, 150 on tree, -1.1947726e+08 best solution, best possible -1.1950325e+08 (9.53 seconds)
Cbc0038I Full problem 2 rows 2109 columns, reduced to 2 rows 6 columns
Cbc0010I After 400 nodes, 199 on tree, -1.1947726e+08 best solution, best possible -1.1950325e+08 (9.66 seconds)
Cbc0038I Full problem 2 rows 2109 columns, reduced to 2 rows 7 columns
Cbc0010I After 500 nodes, 248 on tree, -1.1947726e+08 best solution, best possible -1.1950325e+08 (9.79 seconds)
Cbc0038I Full problem 2 rows 2109 columns, reduced to 2 rows 8 columns
Cbc0010I After 600 nodes, 296 on tree, -1.1947726e+08 best solution, best possible -1.1950325e+08 (9.92 seconds)
Cbc0010I After 700 nodes, 346 on tree, -1.1947726e+08 best solution, best possible -1.1950325e+08 (10.03 seconds)
Cbc0010I After 800 nodes, 396 on tree, -1.1947726e+08 best solution, best possible -1.1950325e+08 (10.14 seconds)
Cbc0010I After 900 nodes, 442 on tree, -1.1947726e+08 best solution, best possible -1.1950325e+08 (10.24 seconds)
Cbc0010I After 1000 nodes, 491 on tree, -1.1947726e+08 best solution, best possible -1.1950325e+08 (10.34 seconds)
Cbc0010I After 1100 nodes, 590 on tree, -1.1947726e+08 best solution, best possible -1.1950325e+08 (10.42 seconds)
Cbc0012I Integer solution of -1.1950175e+08 found by rounding after 2525 iterations and 1168 nodes (10.48 seconds)
Cbc0012I Integer solution of -1.1950247e+08 found by DiveCoefficient after 2578 iterations and 1184 nodes (10.52 seconds)
Cbc0010I After 1200 nodes, 8 on tree, -1.1950247e+08 best solution, best possible -1.1950325e+08 (10.53 seconds)
Cbc0001I Search completed - best objective -119502473.087746, took 2700 iterations and 1229 nodes (10.56 seconds)
Cbc0032I Strong branching done 1234 times (2621 iterations), fathomed 7 nodes and fixed 23 variables
Cbc0035I Maximum depth 287, 267316 variables fixed on reduced cost
Cbc0038I Probing was tried 10 times and created 0 cuts of which 0 were active after adding rounds of cuts (0.009 seconds)
Cbc0038I Gomory was tried 10 times and created 1 cuts of which 0 were active after adding rounds of cuts (0.004 seconds)
Cbc0038I Knapsack was tried 10 times and created 0 cuts of which 0 were active after adding rounds of cuts (0.001 seconds)
Cbc0038I Clique was tried 10 times and created 0 cuts of which 0 were active after adding rounds of cuts (0.000 seconds)
Cbc0038I MixedIntegerRounding2 was tried 256 times and created 3 cuts of which 0 were active after adding rounds of cuts (0.068 seconds)
Cbc0038I FlowCover was tried 256 times and created 135 cuts of which 0 were active after adding rounds of cuts (0.054 seconds)
Cbc0038I TwoMirCuts was tried 10 times and created 0 cuts of which 0 were active after adding rounds of cuts (0.001 seconds)
Cbc0038I ZeroHalf was tried 10 times and created 0 cuts of which 0 were active after adding rounds of cuts (0.100 seconds)
Cbc0012I Integer solution of -1.1950247e+08 found by Reduced search after 2773 iterations and 1279 nodes (10.57 seconds)
Cbc0001I Search completed - best objective -119502473.087746, took 2773 iterations and 1279 nodes (10.58 seconds)
Cbc0032I Strong branching done 170 times (233 iterations), fathomed 0 nodes and fixed 0 variables
Cbc0035I Maximum depth 24, 61999 variables fixed on reduced cost
Cuts at root node changed objective from -1.19504e+08 to -1.19504e+08
Probing was tried 1 times and created 0 cuts of which 0 were active after adding rounds of cuts (0.008 seconds)
Gomory was tried 1 times and created 0 cuts of which 0 were active after adding rounds of cuts (0.001 seconds)
Knapsack was tried 1 times and created 0 cuts of which 0 were active after adding rounds of cuts (0.001 seconds)
Clique was tried 1 times and created 0 cuts of which 0 were active after adding rounds of cuts (0.000 seconds)
MixedIntegerRounding2 was tried 1 times and created 0 cuts of which 0 were active after adding rounds of cuts (0.003 seconds)
FlowCover was tried 1 times and created 0 cuts of which 0 were active after adding rounds of cuts (0.001 seconds)
TwoMirCuts was tried 1 times and created 0 cuts of which 0 were active after adding rounds of cuts (0.001 seconds)
ZeroHalf was tried 1 times and created 0 cuts of which 0 were active after adding rounds of cuts (0.001 seconds)
Result - Optimal solution found
Objective value: 119502473.08774604
Enumerated nodes: 1279
Total iterations: 2773
Time (CPU seconds): 10.08
Time (Wallclock seconds): 10.61
Option for printingOptions changed from normal to all
Total time (CPU seconds): 10.16 (Wallclock seconds): 10.71
Solver status: Optimal
Step 7 complete — Model solved.
Extracting and Interpreting the Selected Redevelopment Sites
Once the solver finishes, each decision variable x i
has a value in { 0 , 1 } . Interpretation:
x(i) = 1: parcel (i) is selected in the optimal portfolio. x(i) = 0: parcel (i) is not selected.
A selected_parcels DataFrame is constructed containing:
Parcel identifiers (PARCEL_ID)
Physical characteristics (lot size, assumed buildable GFA)
Financial metrics (revenue, cost, net uplift)
This serves as the core output of the optimization approach and can be visualized and tabulated.
# STEP 8 — Extract Selected Parcels
candidates["selected"] = [pulp.value(x[i]) for i in candidates.index]
selected_parcels = candidates[candidates["selected"] > 0.5].copy()
print(f"Number of selected parcels: {len(selected_parcels)}")
selected_parcels[[
"PARCEL_ID", "lot_sqft", "buildable_gfa",
"revenue", "cost", "net_uplift"
]].head()Number of selected parcels: 100
| PARCEL_ID | lot_sqft | buildable_gfa | revenue | cost | net_uplift | |
|---|---|---|---|---|---|---|
| 363 | 1001684745 | 5633.257617 | 5633.257617 | 1.971640e+06 | 1.408314e+06 | 562325.761691 |
| 829 | 1001623058 | 383.996984 | 383.996984 | 1.343989e+05 | 9.599925e+04 | 30799.698380 |
| 974 | 1001167215 | 1180.751146 | 1180.751146 | 4.132629e+05 | 2.951878e+05 | 113475.114557 |
| 1405 | 1001674492 | 2285.637429 | 2285.637429 | 7.999731e+05 | 5.714094e+05 | 222263.742930 |
| 1476 | 1001674555 | 995.842675 | 995.842675 | 3.485449e+05 | 2.489607e+05 | 94084.267492 |
Results
Result - Optimal solution found Objective value: 119502473.08774604 Enumerated nodes: 1279 Total iterations: 2773
This means:
-Constraints were satisfied
-Budget wasn’t exceeded
-The solver found the best possible combination of parcels
-The solution is mathematically valid and complete
100 Parcels were selected
This is exactly what should happen from a sample of 20,000 given the prioritizations:
Budget = ~$300M
Max sites = 500
Only positive uplift parcels are considered
Many parcels have very low GFA
FAR assumption = 1.0
Costs & revenues produce realistic uplift estimates
5,000 binary variables is a big model, and it still solved ver efficiently.
Table
We can clearly see in this table that the mathematical model has done great work doing financial analysis per parcel and this data will be used moving forward to the final step of scoring each parcel.
# Professional formatted table using pandas Styler
styled_table = (
selected_parcels
.assign(
buildable_gfa=lambda d: d["buildable_gfa"].round(0).astype(int).map("{:,}".format),
revenue=lambda d: d["revenue"].map("${:,.0f}".format),
cost=lambda d: d["cost"].map("${:,.0f}".format),
net_uplift=lambda d: d["net_uplift"].map("${:,.0f}".format)
)[
["PARCEL_ID", "buildable_gfa", "revenue", "cost", "net_uplift"]
]
.sort_values("net_uplift", ascending=False)
.style.set_properties(**{
"text-align": "center",
"font-size": "14px"
})
.set_table_styles([
{"selector": "thead th", "props": [
("background-color", "#0F172A"),
("color", "white"),
("font-weight", "bold"),
("padding", "10px"),
("font-size", "15px")
]},
{"selector": "tbody td", "props": [
("padding", "8px"),
]}
])
)
styled_table| PARCEL_ID | buildable_gfa | revenue | cost | net_uplift | |
|---|---|---|---|---|---|
| 15748 | 1001148842 | 1,040 | $364,011 | $260,008 | $96,303 |
| 10543 | 1001075514 | 1,022 | $357,590 | $255,422 | $95,569 |
| 1476 | 1001674555 | 996 | $348,545 | $248,961 | $94,084 |
| 19881 | 1001127555 | 957 | $334,938 | $239,242 | $92,397 |
| 2579 | 1001484533 | 8,253 | $2,888,428 | $2,063,163 | $825,265 |
| 2755 | 1001401950 | 152 | $53,283 | $38,060 | $8,524 |
| 11652 | 1001128024 | 796 | $278,557 | $198,970 | $74,788 |
| 5471 | 1001075333 | 733 | $256,579 | $183,271 | $70,008 |
| 5054 | 1001674487 | 749 | $261,992 | $187,137 | $68,655 |
| 16465 | 1001445325 | 745 | $260,754 | $186,253 | $68,301 |
| 14951 | 1001212272 | 740 | $259,037 | $185,026 | $66,711 |
| 19923 | 1001113291 | 665,927 | $233,074,565 | $166,481,832 | $66,592,733 |
| 8364 | 1001535110 | 690 | $241,588 | $172,563 | $61,525 |
| 1647 | 1001248623 | 688 | $240,742 | $171,959 | $61,083 |
| 3067 | 1001168316 | 682 | $238,852 | $170,609 | $60,543 |
| 8908 | 1001502359 | 680 | $238,018 | $170,013 | $60,405 |
| 14479 | 1001684154 | 61,421 | $21,497,277 | $15,355,198 | $6,142,079 |
| 3990 | 1001348669 | 669 | $234,033 | $167,166 | $58,966 |
| 15959 | 1001383422 | 649 | $227,270 | $162,336 | $58,234 |
| 363 | 1001684745 | 5,633 | $1,971,640 | $1,408,314 | $562,326 |
| 4052 | 1001286124 | 619 | $216,521 | $154,658 | $55,163 |
| 9225 | 1001163664 | 627 | $219,562 | $156,830 | $55,132 |
| 2176 | 1001228409 | 5,277 | $1,846,891 | $1,319,208 | $527,683 |
| 10230 | 1001582492 | 588 | $205,637 | $146,884 | $52,254 |
| 1574 | 1001675580 | 5,187 | $1,815,529 | $1,296,806 | $515,523 |
| 7621 | 1001285140 | 580 | $202,860 | $144,900 | $51,060 |
| 6262 | 1001302094 | 561 | $196,393 | $140,281 | $48,612 |
| 7231 | 1001375630 | 503 | $176,062 | $125,758 | $47,703 |
| 13117 | 1001531261 | 546 | $190,927 | $136,376 | $47,651 |
| 12735 | 1001285128 | 522 | $182,576 | $130,412 | $46,365 |
| 19167 | 1001163510 | 519 | $181,743 | $129,816 | $46,226 |
| 10590 | 1001285136 | 516 | $180,745 | $129,104 | $45,842 |
| 5142 | 1001302107 | 530 | $185,509 | $132,507 | $45,503 |
| 3733 | 1001383416 | 497 | $174,004 | $124,289 | $43,315 |
| 19973 | 1001166637 | 497 | $174,043 | $124,317 | $43,227 |
| 4031 | 1001235402 | 484 | $169,307 | $120,934 | $42,574 |
| 9774 | 1001094157 | 489 | $171,025 | $122,161 | $42,264 |
| 15220 | 1001166635 | 486 | $169,967 | $121,405 | $42,062 |
| 1503 | 1001383428 | 484 | $169,481 | $121,058 | $42,023 |
| 11731 | 1001510501 | 4,183 | $1,464,088 | $1,045,777 | $411,211 |
| 17551 | 1001269332 | 459 | $160,574 | $114,695 | $40,578 |
| 14276 | 1001681864 | 45,741 | $16,009,208 | $11,435,148 | $4,569,059 |
| 15125 | 1001683800 | 40,340 | $14,118,888 | $10,084,920 | $4,033,968 |
| 11549 | 1001588395 | 3,933 | $1,376,389 | $983,135 | $393,254 |
| 5545 | 1001326376 | 466 | $163,204 | $116,574 | $39,430 |
| 14397 | 1001535154 | 444 | $155,543 | $111,102 | $39,141 |
| 15815 | 1001383446 | 453 | $158,402 | $113,144 | $38,058 |
| 10981 | 1001402701 | 461 | $161,277 | $115,198 | $37,979 |
| 19157 | 1001535156 | 431 | $150,767 | $107,691 | $37,776 |
| 4151 | 1001326367 | 441 | $154,359 | $110,256 | $37,603 |
| 9825 | 1001326358 | 436 | $152,512 | $108,937 | $36,375 |
| 18018 | 1001375212 | 3,496 | $1,223,466 | $873,904 | $349,562 |
| 19722 | 1001303381 | 3,514 | $1,229,896 | $878,497 | $344,199 |
| 7368 | 1001163684 | 391 | $136,713 | $97,652 | $33,961 |
| 11846 | 1001623086 | 388 | $135,815 | $97,011 | $31,904 |
| 9754 | 1001259368 | 380 | $132,865 | $94,904 | $31,862 |
| 18625 | 1001623082 | 381 | $133,472 | $95,337 | $31,235 |
| 18180 | 1001334208 | 3,101 | $1,085,313 | $775,224 | $303,589 |
| 9183 | 1001334216 | 3,101 | $1,085,313 | $775,224 | $303,189 |
| 1781 | 1001623063 | 384 | $134,422 | $96,016 | $30,806 |
| 829 | 1001623058 | 384 | $134,399 | $95,999 | $30,800 |
| 19446 | 1001235418 | 373 | $130,615 | $93,296 | $29,618 |
| 15428 | 1001348692 | 363 | $126,884 | $90,631 | $28,753 |
| 14190 | 1001442445 | 336 | $117,470 | $83,907 | $28,563 |
| 8092 | 1001264272 | 363 | $127,009 | $90,720 | $28,488 |
| 3838 | 1001487029 | 355 | $124,123 | $88,659 | $28,064 |
| 16963 | 1001167281 | 338 | $118,197 | $84,426 | $27,470 |
| 5022 | 1001685232 | 2,689 | $941,198 | $672,285 | $268,914 |
| 8030 | 1001622866 | 292 | $102,355 | $73,111 | $25,844 |
| 17453 | 1001262901 | 2,523 | $883,024 | $630,731 | $244,893 |
| 1405 | 1001674492 | 2,286 | $799,973 | $571,409 | $222,264 |
| 9576 | 1001674490 | 2,286 | $799,973 | $571,409 | $221,264 |
| 7166 | 1001342304 | 264 | $92,401 | $66,000 | $20,600 |
| 11422 | 1001430746 | 28,762 | $10,066,837 | $7,190,598 | $2,876,239 |
| 6408 | 1001178940 | 23,967 | $8,388,586 | $5,991,847 | $2,396,739 |
| 18578 | 1001051382 | 1,990 | $696,612 | $497,580 | $192,932 |
| 1855 | 1001527910 | 1,939 | $678,796 | $484,854 | $188,242 |
| 18025 | 1001683522 | 175 | $61,226 | $43,733 | $17,493 |
| 16867 | 1001151750 | 1,667 | $583,437 | $416,741 | $161,696 |
| 14078 | 1001554179 | 1,630 | $570,548 | $407,535 | $158,014 |
| 8011 | 1001554186 | 1,630 | $570,548 | $407,535 | $158,014 |
| 18460 | 1001541701 | 1,599 | $559,623 | $399,730 | $153,392 |
| 18699 | 1001561055 | 186 | $65,150 | $46,536 | $15,014 |
| 8227 | 1001146160 | 1,565 | $547,764 | $391,260 | $149,904 |
| 18641 | 1001608307 | 180 | $63,154 | $45,110 | $14,044 |
| 17352 | 1001561052 | 176 | $61,507 | $43,934 | $13,973 |
| 19704 | 1001480220 | 132,479 | $46,367,530 | $33,119,664 | $13,247,866 |
| 18121 | 1001428196 | 1,288 | $450,915 | $322,082 | $124,933 |
| 14559 | 1001537728 | 1,295 | $453,252 | $323,751 | $122,601 |
| 11167 | 1001051454 | 1,241 | $434,338 | $310,242 | $121,397 |
| 19568 | 1001627457 | 153 | $53,642 | $38,316 | $12,226 |
| 974 | 1001167215 | 1,181 | $413,263 | $295,188 | $113,475 |
| 19636 | 1001436294 | 177 | $61,862 | $44,187 | $11,675 |
| 2709 | 1001064573 | 18,030 | $6,310,386 | $4,507,418 | $1,798,567 |
| 15826 | 1001252983 | 16,168 | $5,658,792 | $4,041,994 | $1,616,798 |
| 2133 | 1001248925 | 15,882 | $5,558,737 | $3,970,526 | $1,588,211 |
| 15357 | 1001401262 | 15,200 | $5,320,059 | $3,800,042 | $1,520,017 |
| 8008 | 1001078334 | 12,258 | $4,290,365 | $3,064,546 | $1,225,818 |
| 18329 | 1001687615 | 10,445 | $3,655,811 | $2,611,294 | $1,044,518 |
| 2228 | 1001580049 | 10,230 | $3,580,579 | $2,557,556 | $1,019,722 |
Dash Board of the Parcels Selected for Redevelopment
Note
The model only computes 20000 parcels due to the size of the data sampling up will result in more parcel selections.
selected_polygons = selected_parcels.copy()
# Convert projection from EPSG:2272 (feet) → WGS84 (lat/lon)
selected_polygons_wgs = selected_polygons.to_crs(4326)
# Ensure net_uplift is numeric
selected_polygons_wgs["net_uplift"] = pd.to_numeric(
selected_polygons_wgs["net_uplift"], errors="coerce"
)
selected_polygons_wgs.hvplot.polygons(
geo=True,
tiles="CartoDark",
color="net_uplift",
cmap="YlGn",
line_color="white",
line_width=1.5,
alpha=0.8,
hover_cols=[
"PARCEL_ID",
"lot_sqft",
"buildable_gfa",
"revenue",
"cost",
"net_uplift"
],
title="Optimized Redevelopment Parcels",
width=900,
height=650
)Computing Variables for Optimization Score
In this stage of the project, a multi-dimensional redevelopment opportunity index for Philadelphia parcels is constructed. While the PuLP optimization produces the mathematically optimal redevelopment portfolio under financial constraints, this is only the initial filter to identify eligible parcels, we also require a qualitative and spatially sensitive scoring framework that evaluates parcels across several redevelopment-relevant dimensions. The goal is to create a Redevelopment Opportunity Score—a normalized, weighted composite of metrics capturing:
Physical Under-utilization Assesses whether land is being used below its potential. Parcels with small buildings relative to lot area often represent redevelopment opportunities.
Market Gap Evaluates discrepancies between market-implied value and tax-assessed value. Undervalued parcels may indicate strong redevelopment potential or overlooked market opportunity.
Structural Obsolescence (Old Structure Flag) Marks parcels with older buildings ,pre-1950, which are more likely to be physically obsolete, inefficient, or candidates for redevelopment.
Zoning Capacity Proxy Approximates allowed density using a simplified FAR assumption. Parcels with high buildable capacity relative to existing structures often present upzoning or high redevelopment opportunity.
Accessibility Potential Captures proximity to major intersections within the city’s street network. Closer proximity generally correlates with increased mobility, visibility, and development attractiveness.
Financial Potential (Net Uplift) Incorporates the earlier estimated uplift from redevelopment (revenue − cost − assessed value). This ensures the scoring system includes an explicit economic component.
Opportunity Score (Weighted Composite) The final index, after normalizing metrics 0–1 using MinMaxScaler and weighting them according to redevelopment relevance. This creates a continuous measure that can be mapped, filtered, ranked, and compared with PuLP outputs in the final dashboard.
Create Working Data Set
An dataset is produced specifically for scoring to ensure that no prior notebook state or variable reuse affects results. This dataset includes all core parcel attributes necessary for computing qualitative, spatial, and financial indicators.
# STEP 1 — Create Data Set For Scoring
qualitative_df = parcel_map_base.copy()
qualitative_df["BUILDING_SQFT"] = pd.to_numeric(qualitative_df["BUILDING_SQFT"], errors="coerce")
qualitative_df["ASSESSED_VALUE"] = pd.to_numeric(qualitative_df["ASSESSED_VALUE"], errors="coerce")
qualitative_df["YEAR_BUILT"] = pd.to_numeric(qualitative_df["YEAR_BUILT"], errors="coerce")
qualitative_df["lot_sqft"] = qualitative_df.geometry.area
# Recompute Financial Variables Needed for Scoring
# FAR assumption
far = 2.0
# Estimated buildable gross floor area
qualitative_df["buildable_gfa"] = qualitative_df["lot_sqft"] * far
# Market price per buildable sqft
market_price_per_sqft = 350
# Estimated revenue
qualitative_df["revenue"] = qualitative_df["buildable_gfa"] * market_price_per_sqft
# Construction cost assumption
cost_per_sqft = 250
# Estimated cost
qualitative_df["cost"] = qualitative_df["buildable_gfa"] * cost_per_sqft
# Net uplift definition
qualitative_df["net_uplift"] = (
qualitative_df["revenue"] -
qualitative_df["cost"] -
qualitative_df["ASSESSED_VALUE"].fillna(0)
)
print("Pre-step complete — Financial variables added to qualitative_df.")
print("Step 1 complete — Working dataset 'qualitative_df' ready.")Pre-step complete — Financial variables added to qualitative_df.
Step 1 complete — Working dataset 'qualitative_df' ready.
Computing Underutilization Ratio
This metric evaluates how effectively land is used relative to its physical footprint. Parcels with a low building-to-land ratio are often underdeveloped, making them attractive candidates for redevelopment.
# STEP 2 — Computing Underutilization Ratio
qualitative_df["underutilization"] = qualitative_df["BUILDING_SQFT"] / qualitative_df["lot_sqft"]
qualitative_df["underutilization"] = qualitative_df["underutilization"].fillna(0)
print("Step 2 complete — Underutilization ratio computed.")Step 2 complete — Underutilization ratio computed.
Computing Market Gap
The market gap metric identifies parcels where market-implied value (using the revenue proxy) exceeds assessed value. A high ratio suggests the property may be undervalued or inefficiently used, signalling market-driven redevelopment potential.
# STEP 3 — Compute Market Gap
qualitative_df["market_gap"] = qualitative_df["revenue"] / qualitative_df["ASSESSED_VALUE"].replace(0, np.nan)
qualitative_df["market_gap"] = qualitative_df["market_gap"].fillna(0)
print("Step 3 complete — Market gap computed.")Step 3 complete — Market gap computed.
Old Structure Variable
Older buildings are more likely to be obsolete, structurally inefficient, or out of sync with current zoning and market expectations. Parcels with pre-1950 structures often represent high redevelopment potential.
# STEP 4 — Old Structure Flag
qualitative_df["old_structure"] = (qualitative_df["YEAR_BUILT"] < 1950).astype(int)
print("Step 4 complete — Old structure flag added.")Step 4 complete — Old structure flag added.
Zoning Capacity Proxy
Since citywide zoning joins are computationally intensive, a generalized FAR assumption is implemented to approximate theoretical development capacity. Parcels with high zoning capacity relative to their existing improvements often hold latent potential.
# STEP 5 — Zoning Capacity Proxy
max_far_proxy = 3.0
qualitative_df["zoning_capacity"] = qualitative_df["lot_sqft"] * max_far_proxy
print("Step 5 complete — Zoning capacity proxy computed.")Step 5 complete — Zoning capacity proxy computed.
Accessibility Score Using osmnx
Accessibility is a fundamental component of redevelopment potential. Instead of performing computationally expensive network routing, we estimate accessibility via the inverse distance from each parcel centroid to the nearest major street-network node. This provides a meaningful, fast, and scalable measure of urban connectivity.
# STEP 6 — Accessability Score
# This method replaces OSMnx nearest-node
# Instead of snapping each parcel to all nodes in the street graph,
# 1. Download the drivable network for Philadelphia
# 2. Extract only major intersections (nodes with degree ≥ 3),
# 3. Build a KDTree for nearest-neighbor search
# 4. Compute distance from each parcel centroid to the closest major intersection
# 5. Convert this distance to an accessibility score via 1/distance.
# Compute centroids
qualitative_df["centroid"] = qualitative_df.geometry.centroid
# Convert centroids to WGS84 for OSMnx
qualitative_df_wgs = qualitative_df.set_geometry("centroid").to_crs(4326)
# Download drivable OSM network for Philadelphia
G = ox.graph_from_place("Philadelphia, Pennsylvania, USA", network_type="drive")
# Extract nodes as a GeoDataFrame
nodes = ox.graph_to_gdfs(G, edges=False)
# Identify major intersection" = nodes with degree > = 3
degree_dict = dict(G.degree())
major_nodes = nodes[nodes.index.map(lambda n: degree_dict.get(n, 0) >= 3)]
# Build KDTree for nearest neighbor search
major_points = np.vstack([major_nodes["x"].values, major_nodes["y"].values]).T
tree = cKDTree(major_points)
# Parcel centroid coordinates
parcel_points = np.vstack([
qualitative_df_wgs.geometry.x.values,
qualitative_df_wgs.geometry.y.values
]).T
# Compute nearest major intersection distance
distances, _ = tree.query(parcel_points, k=1)
# Accessibility = higher when closer to intersections
qualitative_df["accessibility_score"] = 1 / (distances + 1e-6)
print("Step 6 complete — Major-intersection accessibility score computed.")# STEP 6 — Accessability Score
qualitative_df["centroid"] = qualitative_df.geometry.centroid
qualitative_df_wgs = qualitative_df.set_geometry("centroid").to_crs(4326)
G = ox.graph_from_place("Philadelphia, Pennsylvania, USA", network_type="drive")
nodes = ox.graph_to_gdfs(G, edges=False)
degree_dict = dict(G.degree())
major_nodes = nodes[nodes.index.map(lambda n: degree_dict.get(n, 0) >= 3)]
major_nodes = major_nodes.dropna(subset=["x", "y"])
major_points = np.vstack([major_nodes["x"].values, major_nodes["y"].values]).T
tree = cKDTree(major_points)
parcel_points = np.vstack([
qualitative_df_wgs.geometry.x.fillna(0).values,
qualitative_df_wgs.geometry.y.fillna(0).values
]).T
distances, _ = tree.query(parcel_points, k=1)
qualitative_df["accessibility_score"] = 1 / (distances + 1e-6)
# STEP 7 — Normalization
scaler = MinMaxScaler()
metrics_to_scale = qualitative_df[[
"underutilization",
"market_gap",
"old_structure",
"zoning_capacity",
"accessibility_score",
"net_uplift"
]]
scaled = scaler.fit_transform(metrics_to_scale)
scaled_df = pd.DataFrame(
scaled,
columns=[col + "_norm" for col in metrics_to_scale.columns],
index=qualitative_df.index
)
qualitative_df = pd.concat([qualitative_df, scaled_df], axis=1)
# STEP 2 — Weighted Score
qualitative_df["opportunity_score"] = (
0.25 * qualitative_df["underutilization_norm"] +
0.20 * qualitative_df["market_gap_norm"] +
0.10 * qualitative_df["old_structure_norm"] +
0.15 * qualitative_df["zoning_capacity_norm"] +
0.10 * qualitative_df["accessibility_score_norm"] +
0.20 * qualitative_df["net_uplift_norm"]
)Normalize Metrics Using MinMaxScaler and Calculate Weighted Opportunity Score
Each qualitative indicator is measured in different units, (sqft, dollars, percents, binaries, etc.). To make them comparable and suitable for weighted scoring, they are normalized to the same 0–1 scale using MinMaxScaler. This prevents any single metric from overpowering the composite Opportunity Score.
Redevelopment indicators exist on different scales and magnitudes. Normalization transforms them into a uniform 0–1 range, ensuring comparability and enabling weighted scoring without bias toward metrics with larger numeric ranges
The Opportunity Score is a composite redevelopment metric constructed from weighted normalized indicators. Weights are assigned based on redevelopment relevance:
- Under-utilization (25%)
- Market gap (20%)
- Old structure (10%)
- Zoning capacity proxy (15%)
- Accessibility (10%)
- Financial uplift (20%)
This produces a continuous index ranging from 0 to 1, allowing for ranking, mapping, and comparison with the optimization model. Higher values indicate stronger redevelopment potential under qualitative criteria.
We aggregate all normalized indicators into a composite Opportunity Score reflecting multiple redevelopment dimensions. This creates a single interpretable measure for ranking parcels, building dashboard filters, and comparing with PuLP optimization outcomes.
# STEP 1 — Normalize Metrics
# Initialize scaler
scaler = MinMaxScaler()
# Select metrics to normalize
metrics_to_scale = qualitative_df[[
"underutilization",
"market_gap",
"old_structure",
"zoning_capacity",
"accessibility_score",
"net_uplift"
]]
# Fit-transform metrics to 0–1 range
scaled = scaler.fit_transform(metrics_to_scale)
# Store normalized metrics in a new Data Frame
scaled_df = pd.DataFrame(
scaled,
columns=[col + "_norm" for col in metrics_to_scale.columns],
index=qualitative_df.index
)
# Add normalized metrics to original dataset
qualitative_df = pd.concat([qualitative_df, scaled_df], axis=1)
print("Step 1 complete — Metrics normalized.")# STEP 2 — Compute Weighted Opportunity Score
qualitative_df["opportunity_score"] = (
0.25 * qualitative_df["underutilization_norm"] +
0.20 * qualitative_df["market_gap_norm"] +
0.10 * qualitative_df["old_structure_norm"] +
0.15 * qualitative_df["zoning_capacity_norm"] +
0.10 * qualitative_df["accessibility_score_norm"] +
0.20 * qualitative_df["net_uplift_norm"]
)
print("Step 2 complete — Opportunity Score computed.")This Concludes all Data Analysis and Computations, the final piece of the tool is the visualization dashboard.
Analytics Business Intelligence Dashboard
# Centroid Preparation - Polygons do not work
parcel_map_base["centroid"] = parcel_map_base.geometry.centroid
qualitative_df["centroid"] = qualitative_df.geometry.centroid
parcel_pts = gpd.GeoDataFrame(
parcel_map_base.drop(columns="geometry"),
geometry=parcel_map_base["centroid"],
crs="EPSG:2272"
)
qual_pts = gpd.GeoDataFrame(
qualitative_df.drop(columns="geometry"),
geometry=qualitative_df["centroid"],
crs="EPSG:2272"
)
parcel_pts_3857 = parcel_pts.to_crs(epsg=3857)
qual_pts_3857 = qual_pts.to_crs(epsg=3857)
parcel_pts_3857["x"] = parcel_pts_3857.geometry.x
parcel_pts_3857["y"] = parcel_pts_3857.geometry.y
qual_pts_3857["x"] = qual_pts_3857.geometry.x
qual_pts_3857["y"] = qual_pts_3857.geometry.y
print("Centroid prep complete — ready for dashboard (with sampling).")
# Base Tiles
tiles = hv.element.tiles.CartoDark().opts(width=900, height=600)
# Sample Function
def sample_df(df, n=20000):
if len(df) > n:
return df.sample(n, random_state=42)
return df
# Opportunity Score - First Tab
def opportunity_map():
df = sample_df(qual_pts_3857, n=20000)
pts = df.hvplot.points(
x="x", y="y",
color="opportunity_score",
cmap="Viridis",
size=5, alpha=0.7,
hover_cols=["PARCEL_ID", "opportunity_score"],
geo=False, width=900, height=600,
title="Redevelopment Opportunity Score"
)
return tiles * pts
# Explorer Page - Second Tab
attr_select = pn.widgets.Select(
name="Attribute",
options=[
"ASSESSED_VALUE","SALE_PRICE","VACANT_FLAG",
"PERMIT_COUNT","lot_sqft","building_sqft","year_built"
],
value="ASSESSED_VALUE"
)
def explorer_map(attr):
df = sample_df(parcel_pts_3857, n=20000)
pts = df.hvplot.points(
x="x", y="y",
color=attr,
cmap="YlGn",
size=5, alpha=0.7,
hover_cols=["PARCEL_ID", attr],
geo=False, width=900, height=600,
title=f"Parcels Colored by {attr} (20,000 sample)"
)
return tiles * pts
explorer_panel = pn.bind(explorer_map, attr=attr_select)
# Build Tabs
tab1 = pn.Column(
"Opportunity Score Map",
opportunity_map()
)
tab2 = pn.Column(
"Parcel Explorer",
attr_select,
explorer_panel
)
dashboard = pn.Tabs(
("Opportunity Score", tab1),
("Parcel Explorer", tab2)
)
dashboardCentroid prep complete — ready for dashboard (with sampling).